Gather

沿给定的轴 axis,根据 indices 张量提供的索引值,从 input 张量中收集数据。支持由 batch_dims 指定的批处理维度:input 与 indices 的前 batch_dims 个维度一一对应(这些维度上两者的大小应一致),聚集在每个批次内独立进行;由下式可知,批处理维度必须位于 axis 之前,即 batch_dims ≤ axis。

\[\begin{split}\text{output}[i_0, ..., i_{axis-1}, j_0, ..., j_{indices\_ndim-batch\_dims-1}, i_{axis+1}, ..., i_{input\_ndim-1}] = \\ \text{input}[i_0, ..., i_{axis-1}, \text{indices}[i_0, ..., i_{batch\_dims-1}, j_0, ..., j_{indices\_ndim-batch\_dims-1}], i_{axis+1}, ..., i_{input\_ndim-1}]\end{split}\]
输入:
  • output - 计算结果输出地址。

  • input - 输入源张量数据地址。

  • input_shape - 输入张量的形状数组地址。

  • input_ndim - 输入张量的维度数量。

  • indices - 索引张量数据地址(通常为 int32 类型)。

  • indices_shape - 索引张量的形状数组地址。

  • indices_ndim - 索引张量的维度数量。

  • axis - 沿着哪个轴进行聚集操作。

  • batch_dims - 批处理维度数量。

  • core_mask - 核掩码(int)。仅共享存储版本(_s 后缀接口)包含该参数,私有存储版本(_p 后缀接口)没有该参数;例如示例中的 0xFF 表示使用 8 个核并行。

输出:
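  • output - 聚集后的计算结果。其形状由 input_shape[0 .. axis-1]、indices_shape[batch_dims .. indices_ndim-1]、input_shape[axis+1 .. input_ndim-1] 依次拼接而成,维度数量为 input_ndim + indices_ndim - batch_dims - 1;调用方需保证 output 指向的空间足以容纳该结果。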

支持平台:

FT78NE MT7004

备注:

  • FT78NE 支持 int8, int16, int32, fp32, fp64, cplx64, cplx128

  • MT7004 支持 fp16, fp32, int16, int32, cplx64

  • 索引张量 indices 内部存储的索引值必须在 [0, input_shape[axis]) 范围内,否则行为未定义。

  • 聚集操作涉及非连续访存,在大规模数据下建议使用共享存储版本并行处理。

共享存储版本:

void i8_gather_s(int8_t *output, int8_t *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims, int core_mask)
void i16_gather_s(int16_t *output, int16_t *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims, int core_mask)
void i32_gather_s(int32_t *output, int32_t *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims, int core_mask)
void hp_gather_s(half *output, half *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims, int core_mask)
void fp_gather_s(float *output, float *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims, int core_mask)
void dp_gather_s(double *output, double *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims, int core_mask)
void c64_gather_s(float *output, float *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims, int core_mask)
void c128_gather_s(double *output, double *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims, int core_mask)

C调用示例:

 1// FT78NE 示例:多核并行聚集操作
 2#include <stdio.h>
 3#include "78NE/utils.h"
 4
 5int main() {
 6    float *input = (float *)0xA0000000;
 7    int *indices = (int *)0xB0000000;
 8    float *output = (float *)0xC0000000;
 9    int input_shape[] = {16, 800, 80};
10    int indices_shape[] = {16, 400};
11    int input_ndim = 3;
12    int indices_ndim = 2;
13    int axis = 1;
14    int batch_dims = 1;
15    int core_mask = 0xFF; // 使用8核并行
16
17    fp_gather_s(output, input, input_shape, input_ndim, indices, indices_shape, indices_ndim, axis, batch_dims, core_mask);
18    return 0;
19}

私有存储版本:

void i8_gather_p(int8_t *output, int8_t *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims)
void i16_gather_p(int16_t *output, int16_t *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims)
void i32_gather_p(int32_t *output, int32_t *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims)
void hp_gather_p(half *output, half *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims)
void fp_gather_p(float *output, float *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims)
void dp_gather_p(double *output, double *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims)
void c64_gather_p(float *output, float *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims)
void c128_gather_p(double *output, double *input, int *input_shape, int input_ndim, int *indices, int *indices_shape, int indices_ndim, int axis, int batch_dims)

C调用示例:

 1// MT7004 示例:单核聚集操作
 2#include <stdio.h>
 3
 4int main() {
 5    float *input = (float *)0x10000000;
 6    int *indices = (int *)0x10010000;
 7    float *output = (float *)0x10020000;
 8    int input_shape[] = {2, 100, 10};
 9    int indices_shape[] = {2, 50};
10    int input_ndim = 3;
11    int indices_ndim = 2;
12    int axis = 1;
13    int batch_dims = 1;
14
15    fp_gather_p(output, input, input_shape, input_ndim, indices, indices_shape, indices_ndim, axis, batch_dims);
16    return 0;
17}